*Take OSHA inspection data, and create a panel at state-year level for various outcomes
/*
global rawdata_imis /export/projects/mtoffel_project/matt/rawdata/osha/
global melissa_data_imis /export/projects/mtoffel_project/melissa/federal_osha/data/

global wrongful_discharge /export/projects/mtoffel_project/matt/wrongful_discharge_laws/

global replication_rawdata /export/projects/mtoffel_project/matt/wrongful_discharge_laws/replication/data/raw/
global replication_clean /export/projects/mtoffel_project/matt/wrongful_discharge_laws/replication/data/cleaned/
*/	
global project "C:/Users/msj22/Dropbox/research/wrongful_discharge/replication_May_2022/"
global replication_rawdata "${project}data/"
global replication_cleandata "${project}intermediate_dtas/"

*Violations data
	u ${replication_rawdata}osha_violation_accessed20140223.dta, clear
		*Type of violation
			gen serious_viol = viol_type == "S"
			gen willful_viol = viol_type == "W"
			gen repeat_viol = viol_type == "R"
			gen u_viol = viol_type == "U"
			gen other_viol = viol_type == "O"

		*Collapse data
			collapse (sum) *_viol initial_penalty* gravity (min) issuance_date, by(activity_nr) 
		*NOTE: This gives total # of each kind of violation, total penalties, and AVERAGE gravity of violations for each inspection
			local viol_types "serious willful repeat u other"
			foreach type in `viol_types' {
				lab var `type'_viol "number of `type' violations"
			}
			gen total_viol = serious_viol+willful_viol+repeat_viol+u_viol+other_viol
			gen repeat_or_will_viol = repeat_viol+willful_viol
			lab var total_viol "Total violations"
			lab var gravity "Sum of gravity of violations"
			lab var repeat_or_will_viol "Number of repeat or willful violations"
		tempfile viols
		save `viols', replace
		
		
*Bring in the IMIS inspection data
	u ${replication_rawdata}osha_inspection_accessed_071612_revised15Dec14, clear
	
	cap drop *year*
	gen open_year = year(open_date)
	drop if open_date==.
	*Drop errors in states
		drop if length(site_state)!=2
		drop if site_state=="UK" | site_state=="VI" | site_state=="GU" | site_state=="PR" | site_state=="PI" | site_state=="MP" | site_state=="MQ" | site_state=="JQ" | site_state=="FN" | site_state=="CZ" | site_state=="AS"
	*Mark state-plan states
		gen state_run_office = site_state=="AK" | site_state=="AZ" | site_state=="CA" | site_state=="IN" | site_state=="IA" | site_state=="HI"| site_state=="KY" | site_state=="MD" | site_state=="MI" | site_state=="MN" | site_state=="NV" | site_state=="NM" | site_state=="NC" | site_state=="OR" | site_state=="PR" | site_state=="SC" | site_state=="TN" | site_state=="UT"| site_state=="VA" | site_state=="VT" | site_state=="VI" | site_state=="WA" | site_state=="WY"

	*Not complete before 1979 and we're not interested in anything beyond 2008
		drop if open_year<=1978 | open_year>=2010
	
	*Merge in violations
		merge 1:1 activity_nr using `viols', keep(master match) gen(merge_violations)
		foreach v of varlist *_viol gravity {
			replace `v' = 0 if merge_violations==1
		}
		drop merge_viol
		
	*Generate quarter of year
		gen year_quarter =qofd(open_date)
		format year_quarter %tq

	*Sector distrinction
		gen sic2 = substr(sic_code, 1, 2)
		destring sic2, replace

	*Drop mining, since not under the jurisdiction of OSHA
		drop if sic2>=10 & sic2<=14

	*Want to restrict to private companies - however, the problem is that we can't do that pre-1985
	*Try to manually delete government inspections pre 1985 using common strings
		drop if owner_type!="" & owner_type!="A"
		drop if owner_type=="" & regexm(estab_name, "JAIL")
		drop if owner_type=="" & regexm(estab_name, "POST OFFICE")
		drop if owner_type=="" & regexm(estab_name, "POSTAL SERVICE")
		drop if owner_type=="" & regexm(estab_name, "FIRE DEPT")
		drop if owner_type=="" & regexm(estab_name, "FIRE STATIO")
		drop if owner_type=="" & regexm(estab_name, "POLICE")
		drop if owner_type=="" & regexm(estab_name, "SCHOOL")
		drop if owner_type=="" & regexm(estab_name, "ARMY")
		drop if owner_type=="" & regexm(estab_name, "AIR FORCE")
		drop if owner_type=="" & regexm(estab_name, "VETERAN")
		drop if owner_type=="" & regexm(estab_name, "NATIONAL GUARD")
		drop if owner_type=="" & regexm(estab_name, "NAVY") | regexm(estab_name, "NAVAL")
		drop if owner_type=="" & regexm(estab_name, "ADMINISTRATION")
		drop if owner_type=="" & substr(sic_code, 1, 1)=="9"
		
	gen complaint = insp_type=="B"
		lab var complaint "complaint inspection"
	gen fatcat = insp_type=="A"
		lab var fatcat "fatality or catastrophe inspection"
	gen programmed = insp_type=="H" | insp_type=="I" | insp_type=="K"
	*Indicator for fatcats at union estabslshments
		gen fatcat_union = fatcat==1 & (union_status=="A" | union_status=="Y")

	*Create a distinction of mfg, and all others. Later, we'll make category 1 be "total"
		gen sector_coarse = 2 if sic2>=20 & sic2<=39
		replace sector_coarse = 3 if sector_coarse==.

******************************
	*Save a cleaned up version before collapsing 
******************************	
	tempfile imis 
	save `imis', replace 
	
	
	*Monthly file for the WDLs
		u "${replication_cleandata}EPL_monthlyDS.dta", clear
		gen month_year = ym(year, month)
		format month_year %tm
		keep state2 month_year wdla* 
		rename state2 site_state
		tempfile wdl 
		save `wdl', replace 
	 
	*Annual file for the statutes: also get census division and region out of this 
		u "${replication_cleandata}osha_nsc2.dta", clear
		drop if state2==""
		keep if sector==1
		keep state2 year wb* wc* region division 
		foreach v of varlist region division {
		    rename `v' census_`v'
		}
		rename year open_year 
		rename state2 site_state
		tempfile statutes 
		save `statutes', replace 

	use `imis', clear 	
	gen month_year = mofd(open_date)
	format month_year %tm
	
	merge m:1 site_state month_year using `wdl', keep(master match) gen(merge_wdl)
	merge m:1 site_state open_year using `statutes', keep(master match) gen(merge_statute)
	
	egen state_code = group(site_state)
	
				gen region = .
				replace region = 1 if site_state=="CT" | site_state=="MA" | site_state=="ME" | site_state=="NH" | site_state=="RI" | site_state=="VT"
				replace region = 2 if site_state=="NJ" | site_state=="NY" | site_state=="PR" | site_state=="VI"
				replace region = 3 if site_state=="DC" | site_state=="DE" | site_state=="MD" | site_state=="PA" | site_state=="VA" | site_state=="WV"
				replace region = 4 if site_state=="AL" | site_state=="FL" | site_state=="GA" | site_state=="KY" | site_state=="MS" | site_state=="NC" | site_state=="SC" | site_state=="TN"
				replace region = 5 if site_state=="IL" | site_state=="IN" | site_state=="MI" | site_state=="MN" | site_state=="OH" | site_state=="WI"
				replace region = 6 if site_state=="AR" | site_state=="LA" | site_state=="NM" | site_state=="OK" | site_state=="TX"
				replace region = 7 if site_state=="IA" | site_state=="KS" | site_state=="MO" | site_state=="NE"
				replace region = 8 if site_state=="CO" | site_state=="MT" | site_state=="ND" | site_state=="SD" | site_state=="UT" | site_state=="WY"
				replace region = 9 if site_state=="AZ" | site_state=="CA" | site_state=="HI" | site_state=="NV" | site_state=="GU"
				replace region = 10 if site_state=="AK" | site_state=="ID" | site_state=="OR" | site_state=="WA"
			lab var region "OSHA region"

	gen union = union_status=="A" | union_status =="Y"

	gen partial_insp = insp_scope=="B"
	gen records_only_insp = insp_scope=="C"
	
	gen ln_nr_in_estab = ln(nr_in_estab+1)
	gen asinh_gravity  = asinh(gravity )
	
	lab var ln_nr_in_estab "Log \# workers present"
	lab var union "Union present"

	*Even coarser industry code 
		gen sic1 = substr(sic_code, 1, 1)
		destring sic1, replace
		
		*clean up variable labels 
			lab var wdlap "Pubic policy exception"
			lab var wdlag "Good faith exception"
			lab var wdlac "Implied contract exception"

*save ${wrongful_discharge}imis_for_wdl_analysis.dta, replace 
	drop if open_year>=2008
	
	save ${replication_cleandata}osha_inspection_cleaned.dta, replace 
	
	u `imis', clear 
		
		
*First, collapse to mfg/non-mfg. Then, collapse to "overall" and append together
	preserve
		*First, mfg/non-mfg
			collapse (max) state_run_office (sum) num_complaint_insp = complaint num_accident_insp = fatcat num_prgm_insp = programmed num_union_acc_insp = fatcat_union , by(site_state year_quarter sector_coarse) 

		tempfile mfg
		save `mfg', replace
	restore
		*Next, overall
			collapse (max) state_run_office (sum) num_complaint_insp = complaint num_accident_insp = fatcat num_prgm_insp = programmed num_union_acc_insp = fatcat_union , by(site_state year_quarter) 
		gen sector_coarse = 1
	append using `mfg'

	sort state year_quarter sector_coarse
	label define sector_coarse_labs 1 "Overall" 2 "Manufacturing" 3 "Non-manufacturing"
	label values sector_coarse sector_coarse_labs

*Create a balanced panel
	rename site_state state

	egen state_sector_group = group(state sector_coarse)
	tsset state_sector_group  year_quarter
	tsfill, full
	
	egen state_fill = mode(state), by(state_sector_group)
	replace state = state_fill if state==""
	drop state_fill

	egen sector_fill = mode(sector_coarse), by(state_sector_group)
	replace sector_coarse = sector_fill if sector_coarse==.
	drop sector_fill
	
	foreach v of varlist num_complaint_insp num_accident_insp num_prgm_insp num_union_acc_insp { 
		replace `v' = 0 if `v'==.
	}
	egen max_state = max(state_run_office), by(state)
	drop state_run_office
	rename max_state state_run_office

lab var state_run_office " State not in federal OSHA jurisdiction"
lab var num_complaint_insp "Number of complaint inspections"
lab var num_accident_insp "Number of accident inspections"
lab var num_prgm_insp "Number of programmed inspections"
lab var num_union_acc_insp "Number of accident inspections of union establishments"

drop state_sector_group

format year_quarter %tq

gen year = year(dofq(year_quarter))
gen quarter = quarter(dofq(year_quarter))

saveold ${replication_cleandata}complaint_accident_insps_state_year.dta, replace version(12)
